library(dataQualityR) #for data cleaning
library(e1071) #SVM Training & Testing Models
library(mice) #deal with missing data
library(corrplot) #plot for correlation matrix
library(ggplot2) #visualization and plots
library(ggpubr) #customizing ggplot2
library(scales) #graphical scales map data to aesthetics
library(caret) #deal with Classification And REgression Training CART
library(dplyr) #data manipulation: filter and arrange
library(tidyverse) #data manipulation
library(sf) #plot mapping
library(gganimate) #static visualization
library(MASS) #deal with data set
library(VIM) #tools for the visualization of missing or imputed values
library(glmnet) #Lasso and Elastic-Net Regularization

Outline:

  1. Introduction - Business Context
  2. Data Preparation 2.1. Data collection 2.2. Data Cleaning 2.3. Correlation Analysis
  3. Exploratory Data Analysis - EDA
  4. Modeling
  5. Evaluation

1. Introduction

Ce projet vise à déterminer les caractéristiques qui sont les meilleurs indicateurs de la qualité du vin rouge et à générer un aperçu de chacun de ces facteurs pour la qualité du vin rouge de notre modèle. Par la suite, ce modèle pourra servir les entreprises du secteur à prédire la qualité de leur vin.

2. Data Preparation

2.2. Data cleaning

  • Data reading
# Load the red-wine data from the working directory and show its dimensions.
df <- read.csv(file = "winequality-red.csv")
dim(df)
## [1] 1599   12
  • Data overview
# Show each column's type together with its first few values.
str(df)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(df)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
  • data report pour cat et num
# Write the numeric and categorical data-quality reports into the current
# working directory. file.path() assembles the paths instead of pasting the
# separator by hand.
num.file <- file.path(getwd(), "dqames_num.csv")
cat.file <- file.path(getwd(), "dqames_cat.csv")
checkDataQuality(data = df, out.file.num = num.file, out.file.cat = cat.file)
## Check for numeric variables completed // Results saved to disk // Time difference of 0.02191997 secs
##  // Time difference of 0.000003099442 secs
  • Vérification des données manquantes
# Missing-value overview: share of missing values per variable plus the
# missingness patterns, with variables sorted by their number of missings.
aggr_plot <- aggr(
  df,
  col = c('navyblue', 'red'),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(df),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##              Variable Count
##         fixed.acidity     0
##      volatile.acidity     0
##           citric.acid     0
##        residual.sugar     0
##             chlorides     0
##   free.sulfur.dioxide     0
##  total.sulfur.dioxide     0
##               density     0
##                    pH     0
##             sulphates     0
##               alcohol     0
##               quality     0

2.3. Correlation analysis

  • Matrice de corrélation
library(dplyr)
library(tidyr)

# Rank the pairwise correlations from highest to lowest, keeping one row per
# distinct value so that each symmetric pair (a, b) / (b, a) appears once.
# No seed is set here: nothing in this step is random.
#
# The correlation matrix is symmetric, so which axis is called var1 is
# arbitrary; the row names are stored as var2 so that the long table comes
# out in the same row order as the column-wise stacking used previously.
# dplyr::select() is spelled out because MASS, attached after dplyr, masks
# select().
cor(df) %>%
  as.data.frame() %>%
  mutate(var2 = rownames(.)) %>%
  pivot_longer(-var2, names_to = "var1", values_to = "value") %>%
  dplyr::select(var1, var2, value) %>%
  arrange(desc(value)) %>%
  distinct(value, .keep_all = TRUE) # first row per value; result is ungrouped
## # A tibble: 67 × 3
## # Groups:   value [67]
##    var1                 var2                value
##    <chr>                <chr>               <dbl>
##  1 fixed.acidity        fixed.acidity       1    
##  2 citric.acid          fixed.acidity       0.672
##  3 density              fixed.acidity       0.668
##  4 total.sulfur.dioxide free.sulfur.dioxide 0.668
##  5 quality              alcohol             0.476
##  6 sulphates            chlorides           0.371
##  7 density              citric.acid         0.365
##  8 density              residual.sugar      0.355
##  9 sulphates            citric.acid         0.313
## 10 quality              sulphates           0.251
## # … with 57 more rows
# Colour-coded correlation matrix with the coefficients printed in each cell;
# the diagonal is left out.
dfcor <- cor(df)
corrplot(
  corr = dfcor,
  method = "color",
  diag = FALSE,
  addCoef.col = "black",
  number.cex = 0.6,
  tl.col = "black",
  tl.srt = 90
)

  • Construction des corrélations des attributs avec la variable quality
# Absolute correlation of every variable with quality, strongest first.
# The quality column is picked by name rather than by position, so the code
# does not depend on quality being the 12th column.
dfcor <- cor(df)
quality_cor <- dfcor[, "quality"]
absoutcome_cor <- abs(quality_cor)
sort(absoutcome_cor, decreasing = TRUE)
##              quality              alcohol     volatile.acidity 
##           1.00000000           0.47616632           0.39055778 
##            sulphates          citric.acid total.sulfur.dioxide 
##           0.25139708           0.22637251           0.18510029 
##              density            chlorides        fixed.acidity 
##           0.17491923           0.12890656           0.12405165 
##                   pH  free.sulfur.dioxide       residual.sugar 
##           0.05773139           0.05065606           0.01373164

3. Exploratory Data Analysis - EDA

3.1. Overall Quality

# Histogram of the quality scores with a dashed line at the mean.
# - binwidth = 1 gives one bar per integer score instead of the default
#   30 bins.
# - The label is drawn once with annotate(); geom_text() with constant
#   aesthetics drew it once per data row, and its `text =` argument is not a
#   geom_text() parameter, so it was ignored.
q1 <- ggplot(df, aes(quality)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Histogram of quality") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_vline(xintercept = mean(df$quality),
             color = "blue", linetype = "dashed", size = 1) +
  annotate("text", x = 5.6, y = 400, label = "Mean Value",
           colour = "red", angle = 90, vjust = 1.2)
q1

# Normal QQ plot of the quality scores with its reference line.
# The y axis shows the scores as they are: the former labeller divided every
# tick label by 10^6, which turned scores such as 5 into 5e-06.
q2 <- ggplot(df, aes(sample = quality)) +
  stat_qq(color = "dodgerblue4") +
  stat_qq_line(color = "red") +
  labs(title = "QQ Plot for quality", y = "Ordered Values") +
  theme(plot.title = element_text(hjust = 0.5))
q2

3.2. graphs des distributions marginales de quantités numériques clés

# Density curve of one numeric column with a dashed blue line at its mean.
#
# data:   data frame holding the column.
# column: column name as a string.
# Returns the ggplot object. The mean line is part of the returned plot, so
# it is shown both when the plot is printed on its own and when it is placed
# in a grid.
density_with_mean <- function(data, column) {
  ggplot(data, aes(x = .data[[column]])) +
    geom_density() +
    geom_vline(xintercept = mean(data[[column]]),
               color = "blue", linetype = "dashed", size = 1) +
    labs(x = column)
}

p1 <- density_with_mean(df, "fixed.acidity")
p2 <- density_with_mean(df, "volatile.acidity")
p3 <- density_with_mean(df, "citric.acid")
p4 <- density_with_mean(df, "residual.sugar")
p5 <- density_with_mean(df, "chlorides")
p6 <- density_with_mean(df, "free.sulfur.dioxide")
p7 <- density_with_mean(df, "total.sulfur.dioxide")
p8 <- density_with_mean(df, "density")
p9 <- density_with_mean(df, "pH")
p10 <- density_with_mean(df, "sulphates")
p11 <- density_with_mean(df, "alcohol")

# Show every plot on its own, in the same order as before.
for (density_plot in list(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11)) {
  print(density_plot)
}

# Lay the density plots out on three 2 x 2 pages (the last page holds three).
ggarrange(plotlist = list(p1, p2, p3, p4), ncol = 2, nrow = 2)

ggarrange(plotlist = list(p5, p6, p7, p8), ncol = 2, nrow = 2)

ggarrange(plotlist = list(p9, p10, p11), ncol = 2, nrow = 2)

# Box plot of one column of df, titled with the column name so the eleven
# plots can be told apart. Returns the statistics list produced by boxplot().
boxplot_column <- function(column) {
  boxplot(df[[column]], col = "slategray2", pch = 19, main = column)
}

b1 <- boxplot_column("fixed.acidity")

b2 <- boxplot_column("volatile.acidity")

b3 <- boxplot_column("citric.acid")

b4 <- boxplot_column("residual.sugar")

b5 <- boxplot_column("chlorides")

b6 <- boxplot_column("free.sulfur.dioxide")

b7 <- boxplot_column("total.sulfur.dioxide")

b8 <- boxplot_column("density")

b9 <- boxplot_column("pH")

b10 <- boxplot_column("sulphates")

b11 <- boxplot_column("alcohol")

3.3. Contrôle de la qualité par rapport à des variables numériques

# Box plots of the first four predictors, one box per quality score.
# The legend is hidden because the fill colour repeats the x axis.
quality_box_theme <- theme(
  legend.position = 'none',
  plot.title = element_text(size = 9, hjust = 0.5)
)

g1 <- ggplot(df, aes(x = factor(quality), y = fixed.acidity, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("fixed.acidity") +
  ggtitle("Boxplot of Quality vs. fixed.acidity") +
  quality_box_theme
g1

g2 <- ggplot(df, aes(x = factor(quality), y = volatile.acidity, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("volatile.acidity") +
  ggtitle("Boxplot of Quality vs. volatile.acidity") +
  quality_box_theme
g2

g3 <- ggplot(df, aes(x = factor(quality), y = citric.acid, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("citric.acid") +
  ggtitle("Boxplot of Quality vs. citric.acid") +
  quality_box_theme
g3

g4 <- ggplot(df, aes(x = factor(quality), y = residual.sugar, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("residual.sugar") +
  ggtitle("Boxplot of Quality vs. residual.sugar") +
  quality_box_theme
g4

ggarrange(plotlist = list(g1, g2, g3, g4), ncol = 2, nrow = 2)

# There appears to be a positive relationship between citric acid and quality.
# There appears to be a negative relationship between volatile acidity and quality.
# Box plots of chlorides, the two sulfur dioxide measures and density, one
# box per quality score. The legend is hidden because the fill colour repeats
# the x axis.
quality_box_theme <- theme(
  legend.position = 'none',
  plot.title = element_text(size = 9, hjust = 0.5)
)

g5 <- ggplot(df, aes(x = factor(quality), y = chlorides, fill = factor(quality))) +
  geom_boxplot() +
  xlab("Quality") +
  ylab("chlorides") +
  ggtitle("Boxplot of Quality vs. chlorides") +
  quality_box_theme
g5

g6 <- ggplot(df, aes(x = factor(quality), y = free.sulfur.dioxide, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("free.sulfur.dioxide") +
  ggtitle("Boxplot of quality vs. free.sulfur.dioxide") +
  quality_box_theme
g6

g7 <- ggplot(df, aes(x = factor(quality), y = total.sulfur.dioxide, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("total.sulfur.dioxide") +
  ggtitle("Boxplot of quality vs. total.sulfur.dioxide") +
  quality_box_theme
g7

g8 <- ggplot(df, aes(x = factor(quality), y = density, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("density") +
  ggtitle("Boxplot of quality vs. density") +
  quality_box_theme
g8

ggarrange(plotlist = list(g5, g6, g7, g8), ncol = 2, nrow = 2)

# There appears to be a negative relationship between density and quality.
# Box plots of pH, sulphates and alcohol, one box per quality score.
# The legend is hidden because the fill colour repeats the x axis.
quality_box_theme <- theme(
  legend.position = 'none',
  plot.title = element_text(size = 9, hjust = 0.5)
)

g9 <- ggplot(df, aes(x = factor(quality), y = pH, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("pH") +
  ggtitle("Boxplot of Quality vs. pH") +
  quality_box_theme
g9

g10 <- ggplot(df, aes(x = factor(quality), y = sulphates, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("sulphates") +
  ggtitle("Boxplot of quality vs. sulphates") +
  quality_box_theme
g10

g11 <- ggplot(df, aes(x = factor(quality), y = alcohol, fill = factor(quality))) +
  geom_boxplot() +
  xlab("quality") +
  ylab("alcohol") +
  ggtitle("Boxplot of quality vs. alcohol") +
  quality_box_theme
g11

ggarrange(plotlist = list(g9, g10, g11), ncol = 2, nrow = 2)

# There appears to be a positive relationship between alcohol and quality.
# There appears to be a positive relationship between sulphates and quality.

3.3. Inspection de la relation entre les variables numériques

# Scatter plots of four predictor pairs, each with a fitted straight line.
# The layers common to all four plots are collected once and added to each.
scatter_layers <- list(
  geom_point(color = "dodgerblue4", size = 0.7),
  geom_smooth(formula = y ~ x, method = lm, color = "red"),
  theme(plot.title = element_text(hjust = 0.5))
)

s1 <- ggplot(df, aes(x = fixed.acidity, y = citric.acid)) +
  ggtitle("fixed.acidity vs. citric.acid") +
  scatter_layers
s1

s2 <- ggplot(df, aes(x = fixed.acidity, y = density)) +
  ggtitle("fixed.acidity vs. density") +
  scatter_layers
s2

s3 <- ggplot(df, aes(x = free.sulfur.dioxide, y = total.sulfur.dioxide)) +
  ggtitle("free.sulfur.dioxide vs. total.sulfur.dioxide") +
  scatter_layers
s3

s4 <- ggplot(df, aes(x = fixed.acidity, y = pH)) +
  ggtitle("fixed.acidity vs. pH") +
  scatter_layers
s4

ggarrange(plotlist = list(s1, s2, s3, s4), ncol = 2, nrow = 2)

# There appears to be a positive relationship between fixed.acidity and citric.acid
# There appears to be a positive relationship between fixed.acidity and density
# There appears to be a positive relationship between free.sulfur.dioxide and total.sulfur.dioxide
# There appears to be a negative relationship between fixed.acidity and pH

3.4. Inspection 3D

# Three-way views: alcohol rounded to whole degrees on the x axis, a second
# predictor on the y axis, and one coloured box per quality score.
i1 <- ggplot(
  df,
  aes(x = factor(round(alcohol)), y = citric.acid, colour = factor(quality))
) +
  geom_boxplot() +
  ggtitle("Alcohol + Citric.Acid vs. Quality") +
  theme(plot.title = element_text(hjust = 0.5))
i1

i2 <- ggplot(
  df,
  aes(x = factor(round(alcohol)), y = volatile.acidity, colour = factor(quality))
) +
  geom_boxplot() +
  ggtitle("Alcohol + Volatile.Acidity vs. Quality") +
  theme(plot.title = element_text(hjust = 0.5))
i2

# Alcohol (rounded to whole degrees) against chlorides, one coloured box per
# quality score, zoomed to chlorides between 0 and 0.3.
# - theme() is chained with `+`; without it the theme was a separate
#   statement that was printed to the console and never applied to the plot.
# - coord_cartesian() zooms without discarding data, whereas ylim() removes
#   the rows above 0.3 before the box statistics are computed.
i3 <- ggplot(df, aes(x = factor(round(alcohol)), y = chlorides)) +
  geom_boxplot(aes(colour = factor(quality))) +
  labs(title = "Alcohol + Chlorides vs. Quality") +
  coord_cartesian(ylim = c(0, 0.3)) +
  theme(plot.title = element_text(hjust = 0.5))
i3

4. Modeling

4.1. Modélisation avec les 5 principales variables

4.1. Modèle 1 avec les 5 variables les plus corrélées avec quality

# Ordinary least squares on the five predictors with the highest absolute
# correlation with quality, followed by the coefficient table.
# The call is kept on one line because summary() echoes it.
lm0 <- lm(quality ~ alcohol + volatile.acidity +  sulphates + citric.acid + total.sulfur.dioxide, data = df)
summary(lm0)
## 
## Call:
## lm(formula = quality ~ alcohol + volatile.acidity + sulphates + 
##     citric.acid + total.sulfur.dioxide, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.72463 -0.38380 -0.06689  0.44606  2.14550 
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           2.8431068  0.2050732  13.864 < 0.0000000000000002 ***
## alcohol               0.2953419  0.0160375  18.416 < 0.0000000000000002 ***
## volatile.acidity     -1.2223102  0.1124774 -10.867 < 0.0000000000000002 ***
## sulphates             0.7207881  0.1027039   7.018     0.00000000000332 ***
## citric.acid          -0.0427246  0.1035810  -0.412                 0.68    
## total.sulfur.dioxide -0.0022182  0.0005126  -4.327     0.00001602753699 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6552 on 1593 degrees of freedom
## Multiple R-squared:  0.3439, Adjusted R-squared:  0.3418 
## F-statistic:   167 on 5 and 1593 DF,  p-value: < 0.00000000000000022
# Same regression as lm0 with citric.acid removed from the predictors.
# NOTE(review): presumably dropped because its coefficient was not
# significant in lm0 - confirm.
lm1 <- lm(quality ~ alcohol + volatile.acidity +  sulphates + total.sulfur.dioxide, data = df)
summary(lm1)
## 
## Call:
## lm(formula = quality ~ alcohol + volatile.acidity + sulphates + 
##     total.sulfur.dioxide, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.72716 -0.38486 -0.06503  0.44980  2.13257 
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           2.8258128  0.2006892  14.081 < 0.0000000000000002 ***
## alcohol               0.2953105  0.0160331  18.419 < 0.0000000000000002 ***
## volatile.acidity     -1.1985632  0.0966011 -12.407 < 0.0000000000000002 ***
## sulphates             0.7121396  0.1005146   7.085     0.00000000000208 ***
## total.sulfur.dioxide -0.0022354  0.0005108  -4.376     0.00001284518270 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.655 on 1594 degrees of freedom
## Multiple R-squared:  0.3438, Adjusted R-squared:  0.3421 
## F-statistic: 208.8 on 4 and 1594 DF,  p-value: < 0.00000000000000022
# Estimate the out-of-sample error of model 1 with 10-fold cross-validation.
# The seed fixes the fold assignment.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)

# Fit the linear model inside caret so that resampled metrics are recorded.
model1 <- train(
  quality ~ alcohol + volatile.acidity + sulphates + total.sulfur.dioxide,
  data = df,
  trControl = train.control,
  method = "lm"
)

# Coefficient table of the final fit.
summary(model1)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.72716 -0.38486 -0.06503  0.44980  2.13257 
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           2.8258128  0.2006892  14.081 < 0.0000000000000002 ***
## alcohol               0.2953105  0.0160331  18.419 < 0.0000000000000002 ***
## volatile.acidity     -1.1985632  0.0966011 -12.407 < 0.0000000000000002 ***
## sulphates             0.7121396  0.1005146   7.085     0.00000000000208 ***
## total.sulfur.dioxide -0.0022354  0.0005108  -4.376     0.00001284518270 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.655 on 1594 degrees of freedom
## Multiple R-squared:  0.3438, Adjusted R-squared:  0.3421 
## F-statistic: 208.8 on 4 and 1594 DF,  p-value: < 0.00000000000000022
# Print the cross-validated RMSE, R-squared and MAE of model 1.
print(model1)
## Linear Regression 
## 
## 1599 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1439, 1439, 1438, 1439, 1439, 1440, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.6560876  0.3429191  0.5099489
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

4.2. LASSO

library(glmnet)

# Design matrix of all predictors (intercept column removed) and the response.
x <- model.matrix(quality ~ ., data = df)[, -1]
y <- df[["quality"]]

# Cross-validated LASSO (alpha = 1); x is already a matrix.
mod <- cv.glmnet(x, y, alpha = 1)

# Coefficients at the lambda with the lowest cross-validated error.
as.matrix(coef(mod, s = mod$lambda.min))
##                                 s1
## (Intercept)           18.731818444
## fixed.acidity          0.020774482
## volatile.acidity      -1.078204434
## citric.acid           -0.164074955
## residual.sugar         0.014319348
## chlorides             -1.867641097
## free.sulfur.dioxide    0.004233406
## total.sulfur.dioxide  -0.003228361
## density              -14.593604735
## pH                    -0.422494118
## sulphates              0.906073568
## alcohol                0.278563263
# Coefficients at lambda.1se, the second penalty value stored on the
# cv.glmnet fit.
as.matrix(coef(mod, mod$lambda.1se))
##                                 s1
## (Intercept)           3.1453849170
## fixed.acidity         0.0018155594
## volatile.acidity     -1.0221755767
## citric.acid           0.0000000000
## residual.sugar        0.0000000000
## chlorides            -0.2350727376
## free.sulfur.dioxide   0.0000000000
## total.sulfur.dioxide -0.0009327824
## density               0.0000000000
## pH                    0.0000000000
## sulphates             0.4961365592
## alcohol               0.2640733125
# Keep only the coefficients that are not exactly zero at lambda.1se.
CF <- as.matrix(coef(mod, s = mod$lambda.1se))
CF[CF[, 1] != 0, ]
##          (Intercept)        fixed.acidity     volatile.acidity 
##         3.1453849170         0.0018155594        -1.0221755767 
##            chlorides total.sulfur.dioxide            sulphates 
##        -0.2350727376        -0.0009327824         0.4961365592 
##              alcohol 
##         0.2640733125
# Ordinary least squares on the predictors whose LASSO coefficients at
# lambda.1se were non-zero. The list is written out by hand, so it has to be
# updated if the LASSO selection changes.
# The call is kept on one line because summary() echoes it.
lm2 <- lm(quality ~ fixed.acidity + volatile.acidity + chlorides + total.sulfur.dioxide + sulphates + alcohol, data=df)
summary(lm2)
## 
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + chlorides + 
##     total.sulfur.dioxide + sulphates + alcohol, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70812 -0.37181 -0.06238  0.45933  1.99472 
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           2.7365412  0.2325021  11.770 < 0.0000000000000002 ***
## fixed.acidity         0.0236576  0.0099187   2.385               0.0172 *  
## volatile.acidity     -1.0856214  0.0996323 -10.896 < 0.0000000000000002 ***
## chlorides            -1.7376885  0.3913566  -4.440  0.00000960779597327 ***
## total.sulfur.dioxide -0.0021460  0.0005121  -4.191  0.00002933553690691 ***
## sulphates             0.8846921  0.1108310   7.982  0.00000000000000272 ***
## alcohol               0.2825603  0.0166180  17.003 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6504 on 1592 degrees of freedom
## Multiple R-squared:  0.3538, Adjusted R-squared:  0.3514 
## F-statistic: 145.3 on 6 and 1592 DF,  p-value: < 0.00000000000000022
# Estimate the out-of-sample error of model 2 with 10-fold cross-validation.
# The seed fixes the fold assignment.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)

# Fit the linear model inside caret so that resampled metrics are recorded.
model2 <- train(
  quality ~ fixed.acidity + volatile.acidity + chlorides + total.sulfur.dioxide + sulphates + alcohol,
  data = df,
  trControl = train.control,
  method = "lm"
)

# Coefficient table of the final fit.
summary(model2)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70812 -0.37181 -0.06238  0.45933  1.99472 
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           2.7365412  0.2325021  11.770 < 0.0000000000000002 ***
## fixed.acidity         0.0236576  0.0099187   2.385               0.0172 *  
## volatile.acidity     -1.0856214  0.0996323 -10.896 < 0.0000000000000002 ***
## chlorides            -1.7376885  0.3913566  -4.440  0.00000960779597327 ***
## total.sulfur.dioxide -0.0021460  0.0005121  -4.191  0.00002933553690691 ***
## sulphates             0.8846921  0.1108310   7.982  0.00000000000000272 ***
## alcohol               0.2825603  0.0166180  17.003 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6504 on 1592 degrees of freedom
## Multiple R-squared:  0.3538, Adjusted R-squared:  0.3514 
## F-statistic: 145.3 on 6 and 1592 DF,  p-value: < 0.00000000000000022
# Print the cross-validated RMSE, R-squared and MAE of model 2.
print(model2)
## Linear Regression 
## 
## 1599 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1439, 1439, 1438, 1439, 1439, 1440, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE    
##   0.6525392  0.3496559  0.50698
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

4.3 Random Forest Model 3

library(randomForest)
library(mlbench)
library(caret) # varImp() is called on the fitted forest further down

# 70/30 hold-out split: sample 70% of the row indices for training and keep
# the remaining rows for testing. A single seed is set right before
# sampling; an earlier seed that was overridden before any random draw has
# been removed.
set.seed(123)
n <- nrow(df)
trainIndex <- sample(seq_len(n), size = round(0.7 * n), replace = FALSE)
training <- df[trainIndex, ]
testing <- df[-trainIndex, ]

# Random forest regression of quality on all other columns, trying 3
# variables at each split. The call is left as written because print()
# echoes it.
model3 <- randomForest(quality ~ ., training, mtry = 3, 
                         importance = TRUE, na.action = na.omit)
print(model3)
## 
## Call:
##  randomForest(formula = quality ~ ., data = training, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.3669176
##                     % Var explained: 46.23
# Plot of the error as a function of the number of trees.
plot(model3) 

# Importance score of each predictor in the fitted forest.
varImp(model3)
##                       Overall
## fixed.acidity        20.71249
## volatile.acidity     29.63074
## citric.acid          21.53189
## residual.sugar       14.61027
## chlorides            22.20528
## free.sulfur.dioxide  20.00519
## total.sulfur.dioxide 28.35124
## density              28.98152
## pH                   20.88894
## sulphates            44.31574
## alcohol              49.35817
varImpPlot(model3, type = 2)

# Take the MSE from the last element of model3$mse; it should match the
# value printed with the model above.
tail(model3$mse, 1)
## [1] 0.3669176
# The square root of that final MSE is the model's RMSE.
sqrt(tail(model3$mse, 1))
## [1] 0.6057372
# Now compute the RMSE on the held-out test rows, for comparison with the
# figure obtained from the training data.
predValues <- predict(model3, newdata = testing)

# Computed directly from the prediction errors.
sqrt(mean((predValues - testing$quality)^2)) # RMSE
## [1] 0.5506011
# Mean absolute difference between observed and predicted quality on the
# test rows.
mean(abs(testing$quality -predValues)) #MAE
## [1] 0.4104835

Evaluation dataframe

# Comparison table with one row per model. The metrics are read from the
# fitted objects instead of being typed in by hand, so the table cannot
# drift away from the results reported above.
# Models 1 and 2 use their 10-fold cross-validated metrics; model 3 uses its
# metrics on the held-out test rows (postResample() returns RMSE, Rsquared
# and MAE for a vector of predictions against the observed values).
rf_test_metrics <- postResample(pred = predValues, obs = testing$quality)

Model <- c("Model 1", "Model 2", "Model 3")
R_squared <- c(model1$results$Rsquared, model2$results$Rsquared,
               rf_test_metrics[["Rsquared"]])
RMSE <- c(model1$results$RMSE, model2$results$RMSE,
          rf_test_metrics[["RMSE"]])
MAE <- c(model1$results$MAE, model2$results$MAE,
         rf_test_metrics[["MAE"]])
ml <- data.frame(Model, R_squared, RMSE, MAE)

Plot

library(gridExtra)

# Dot plot of one metric column of `ml`, one coloured point per model.
#
# metric: name of the column of `ml` to plot, as a string.
# title:  plot title.
# The point size is a fixed setting (size = 4 outside aes()); inside aes() it
# was treated as a data mapping and passed through the size scale.
metric_plot <- function(metric, title) {
  ggplot(ml, aes(Model, .data[[metric]])) +
    geom_point(aes(colour = factor(Model)), size = 4) +
    labs(title = title) +
    theme(plot.title = element_text(hjust = 0.5),
          axis.title.y = element_blank(),
          axis.title.x = element_blank(),
          legend.position = "none")
}

p1 <- metric_plot("RMSE", "RMSE")
p2 <- metric_plot("R_squared", "R-Squared")
p3 <- metric_plot("MAE", "MAE")
grid.arrange(p2, p1, p3, ncol = 3)